In this project I'll explore datasets on staff retention
By using different machine learning models I'll predict if an employee is or not about to leave the organization
tools used:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix as confucio
from sklearn.model_selection import train_test_split
import numpy as np
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
import category_encoders as ce
from collections import Counter
def evaluate_model(dt_classifier):
print("Train Accuracy :", accuracy_score(y_train, dt_classifier.predict(X_train)))
print("Train Confusion Matrix:")
print(confucio(y_train, dt_classifier.predict(X_train)))
print("-"*50)
print("Test Accuracy :", accuracy_score(y_test, dt_classifier.predict(X_test)))
print("Test Confusion Matrix:")
print(confucio(y_test, dt_classifier.predict(X_test)))
#DATASET 1
df = pd.read_csv('C:/Users/santi/OneDrive/Desktop/HR_comma_sep.csv')
df.columns
#distinct values for some categorical data
unique_list = []
for item in df['salary']:
if item not in unique_list:
unique_list.append(item)
print(unique_list)
len(unique_list)
#Some data cleaning
#renaming columns names
df = df.rename(str.lower, axis='columns')
df = df.rename(columns={'sales': 'department'})
df.columns
#checking null values and shape
df.shape
df = df.dropna(axis=0)
df.isna().sum()
['low', 'medium', 'high']
satisfaction_level 0 last_evaluation 0 number_project 0 average_montly_hours 0 time_spend_company 0 work_accident 0 left 0 promotion_last_5years 0 department 0 salary 0 dtype: int64
#Genreate PLOTS
df['left']=df['left'].replace(0,'No')
df['left']=df['left'].replace(1,'Yes')
g = sns.countplot(data=df, x="left")
g.set_xlabel('left',fontsize=15)
g.set_ylabel('Count',fontsize=15)
g.tick_params(labelsize=11)
g.set_title('Dataset 1 Attrition count plot')
plt.show()
df['left']=df['left'].replace('No',0)
df['left']=df['left'].replace('Yes',1)
df['department']=df['department'].replace('sales',0)
df['department']=df['department'].replace('accounting',1)
df['department']=df['department'].replace('hr',2)
df['department']=df['department'].replace('technical',3)
df['department']=df['department'].replace('support',4)
df['department']=df['department'].replace('management',5)
df['department']=df['department'].replace('IT',6)
df['department']=df['department'].replace('product_mng',7)
df['department']=df['department'].replace('marketing',8)
df['department']=df['department'].replace('RandD',9)
df['salary']=df['salary'].replace('low',1)
df['salary']=df['salary'].replace('medium',2)
df['salary']=df['salary'].replace('high',3)
#ditribution plot
df1 = df.iloc[: , :6]
df2 = df.iloc[: , 6:12]
sns.color_palette("tab10")
n_rows=2
n_cols=3
palette = ('dodgerblue' ,'red','blue' ,'orange','black' ,'purple')
palette = iter(palette)
# Create the subplots
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,figsize=(11, 8))
for i, column in enumerate(df1.columns):
c = next(palette)
sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
n_rows=2
n_cols=3
palette = ('green' ,'black','gold' ,'magenta','cyan' ,'deeppink')
palette = iter(palette)
# Create the subplots
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,figsize=(11, 8))
for i, column in enumerate(df2.columns):
c = next(palette)
sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
df3d = df.sample(n=5000)
#3D plot
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')
scatter = ax.scatter(df3d['satisfaction_level'],
df3d['salary'],
df3d['average_montly_hours'],
s = df3d['satisfaction_level']*(200),
c = df3d['left'],
cmap='viridis', alpha=0.3,linewidth=1.5,edgecolors='black')
ax.set_xlabel('satisfaction level', fontsize=20)
ax.set_ylabel('salary', fontsize=20)
ax.set_zlabel('montly hours', fontsize=20)
handles, labels = scatter.legend_elements(prop="colors", alpha=1)
legend = ax.legend(handles, labels, loc='upper left',fontsize = 17,shadow = True)
legend.set_title('left',prop={'size':'large'})
ax.set_title('* points Sized by satisfaction level ')
plt.show()
# this next plot is taking a while to load
sns.pairplot(df3d,hue='left')
plt.show()
df['department']=df['department'].replace(0,'sales')
df['department']=df['department'].replace(1,'accounting')
df['department']=df['department'].replace(2,'hr')
df['department']=df['department'].replace(3,'technical')
df['department']=df['department'].replace(4,'support')
df['department']=df['department'].replace(5,'management')
df['department']=df['department'].replace(6,'IT')
df['department']=df['department'].replace(7,'product_mng')
df['department']=df['department'].replace(8,'marketing')
df['department']=df['department'].replace(9,'RandD')
df['salary']=df['salary'].replace(1,'low')
df['salary']=df['salary'].replace(2,'medium')
df['salary']=df['salary'].replace(3,'high')
df.isna().sum
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:43: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:53: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:53: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:53: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\3107865022.py:53: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
<bound method NDFrame._add_numeric_operations.<locals>.sum of satisfaction_level last_evaluation number_project \
0 False False False
1 False False False
2 False False False
3 False False False
4 False False False
... ... ... ...
14994 False False False
14995 False False False
14996 False False False
14997 False False False
14998 False False False
average_montly_hours time_spend_company work_accident left \
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 False False False False
... ... ... ... ...
14994 False False False False
14995 False False False False
14996 False False False False
14997 False False False False
14998 False False False False
promotion_last_5years department salary
0 False False False
1 False False False
2 False False False
3 False False False
4 False False False
... ... ... ...
14994 False False False
14995 False False False
14996 False False False
14997 False False False
14998 False False False
[14999 rows x 10 columns]>
# x and y
y = df['left']
x = df.drop(columns=['left'])
print(x.columns)
import category_encoders as ce
#encoder
ce = ce.LeaveOneOutEncoder(cols=['work_accident','promotion_last_5years', 'department', 'salary'],return_df=True)
x = ce.fit_transform(x,y)
Index(['satisfaction_level', 'last_evaluation', 'number_project',
'average_montly_hours', 'time_spend_company', 'work_accident',
'promotion_last_5years', 'department', 'salary'],
dtype='object')
#Feature selection and splitting
# decision tree for feature importance on a classification problem
model = RandomForestClassifier()
#fit
model.fit(x, y)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()
x = x[['promotion_last_5years','work_accident','satisfaction_level']]
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.30)
#ADASYN Oversampling
print("Before sampling: ", Counter(y_train))
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=42)
X_train, y_train = ada.fit_resample(x, y)
print("After ADASYN: ", Counter(y_train))
cor = X_train.corr()
plt.figure(figsize=(12,10))
sns.heatmap(cor, cmap=plt.cm.CMRmap_r,annot=True)
plt.show()
Feature: 0, Score: 0.06910 Feature: 1, Score: 0.01024 Feature: 2, Score: 0.03327 Feature: 3, Score: 0.01428 Feature: 4, Score: 0.02712 Feature: 5, Score: 0.25458 Feature: 6, Score: 0.56574 Feature: 7, Score: 0.00057 Feature: 8, Score: 0.02510
Before sampling: Counter({0: 7958, 1: 2541})
After ADASYN: Counter({1: 11429, 0: 11428})
ADA Boost
#Modelling
#ADA
# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50,
learning_rate=1)
# Train Adaboost Classifer
model = abc.fit(X_train, y_train)
#Predict the response for test dataset
y_pred = model.predict(X_test)
#train model with cv of 10
cv_scores = cross_val_score(abc, X_train, y_train, cv=10)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))
ADA = {'model': 'ADA' ,'precision':precision_score(y_test, y_pred, average='macro'),
'recall':recall_score(y_test, y_pred, average='macro'),
'accuracy': accuracy_score(y_test, y_pred),
'f1': f1_score(y_test, y_pred , average = 'binary'),
'ROC AUC': roc_auc_score(y_test, y_pred) }
CM_ADA = confucio(y_test, y_pred)
print('ADA')
evaluate_model(abc)
[1. 1. 1. 0.99912511 1. 1. 1. 1. 1. 1. ] cv_scores mean:0.999912510936133 ADA Train Accuracy : 1.0 Train Confusion Matrix: [[11428 0] [ 0 11429]] -------------------------------------------------- Test Accuracy : 1.0 Test Confusion Matrix: [[3470 0] [ 0 1030]]
KNN
#knn
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))
#Checking for k value
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
# Loop over K values
for i, k in enumerate(neighbors):
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
# Compute training and test data accuracy
train_accuracy[i] = knn.score(X_train, y_train)
test_accuracy[i] = knn.score(X_test, y_test)
# Generate plot
plt.plot(neighbors, test_accuracy, label = 'Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy')
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()
#cv
#create a new KNN model
knn_cv = KNeighborsClassifier(n_neighbors=1)
#train model with cv of 5
cv_scores = cross_val_score(knn_cv, X_train, y_train, cv=10)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))
KNN = {'model': 'KNN' ,'precision':precision_score(y_test, y_pred, average='macro'),
'recall':recall_score(y_test, y_pred, average='macro'),
'accuracy': accuracy_score(y_test, y_pred),
'f1': f1_score(y_test, y_pred , average = 'binary'),
'ROC AUC': roc_auc_score(y_test, y_pred) }
CM_KNN = confucio(y_test, y_pred)
print('KNN')
evaluate_model(knn)
0.9997777777777778
[0.99956255 0.99781277 0.99868766 0.99956255 0.92650919 0.96456693 0.97156605 1. 0.99956236 1. ] cv_scores mean:0.9857830079773946 KNN Train Accuracy : 0.991206195038719 Train Confusion Matrix: [[11405 23] [ 178 11251]] -------------------------------------------------- Test Accuracy : 0.9982222222222222 Test Confusion Matrix: [[3463 7] [ 1 1029]]
Naive Bayes
#Naive Bayes
nb_model = Pipeline([
('classification', GaussianNB())
])
nb_model.get_params().keys()
nb_clf = GridSearchCV(estimator=nb_model, param_grid={}, scoring='recall', cv=5)
nb_clf.fit(X_train,y_train )
y_pred = nb_clf.predict(X_test)
model_nb_cm = confucio(y_test, y_pred)
model_nb_result = []
model_nb_result.append(precision_score(y_test, y_pred, average='macro'))
model_nb_result.append(recall_score(y_test, y_pred, average='macro'))
model_nb_result.append(accuracy_score(y_test, y_pred))
model_nb_result.append(f1_score(y_test, y_pred , average = 'binary'))
NB = {'model': 'NB' ,'precision':precision_score(y_test, y_pred, average='macro'),
'recall':recall_score(y_test, y_pred, average='macro'),
'accuracy': accuracy_score(y_test, y_pred),
'f1': f1_score(y_test, y_pred , average = 'binary'),
'ROC AUC': roc_auc_score(y_test, y_pred) }
CM_NB = confucio(y_test, y_pred)
print('NB')
evaluate_model(nb_clf)
NB Train Accuracy : 0.6638666491665572 Train Confusion Matrix: [[8694 2734] [4949 6480]] -------------------------------------------------- Test Accuracy : 0.6928888888888889 Test Confusion Matrix: [[2592 878] [ 504 526]]
Decision Tree
#Decision tree
clf = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=7)
clf = clf.fit(X_train, y_train)
tree.plot_tree(clf)
y_pred = clf.predict(X_test)
TREE = {'model': 'TREE' ,'precision':precision_score(y_test, y_pred, average='macro'),
'recall':recall_score(y_test, y_pred, average='macro'),
'accuracy': accuracy_score(y_test, y_pred),
'f1': f1_score(y_test, y_pred , average = 'binary'),
'ROC AUC': roc_auc_score(y_test, y_pred) }
CM_TREE = confucio(y_test, y_pred)
print('tree')
evaluate_model(clf)
tree Train Accuracy : 1.0 Train Confusion Matrix: [[11428 0] [ 0 11429]] -------------------------------------------------- Test Accuracy : 1.0 Test Confusion Matrix: [[3470 0] [ 0 1030]]
Random Forest
#Random forest
dt = RandomForestClassifier(random_state=42)
params = {
'max_depth': [1,2,3,4,5,6,7,8,9,10,15, 20],
'min_samples_leaf': [1,2,3,4,5,6,7,8,9, 10,15, 20, 50, 100],
'criterion': ["gini", "entropy"]
}
grid_search = GridSearchCV(estimator=dt,
param_grid=params,
cv=5, n_jobs=-1, verbose=1, scoring = "accuracy")
grid_search.fit(X_train, y_train)
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()
score_df.nlargest(5,"mean_test_score")
dt_best = grid_search.best_estimator_
FOREST = {'model': 'FOREST' ,'precision':precision_score(y_test, y_pred, average='macro'),
'recall':recall_score(y_test, y_pred, average='macro'),
'accuracy': accuracy_score(y_test, y_pred),
'f1': f1_score(y_test, y_pred , average = 'binary'),
'ROC AUC': roc_auc_score(y_test, y_pred) }
print('Forest')
evaluate_model(grid_search)
Fitting 5 folds for each of 336 candidates, totalling 1680 fits Forest Train Accuracy : 1.0 Train Confusion Matrix: [[11428 0] [ 0 11429]] -------------------------------------------------- Test Accuracy : 1.0 Test Confusion Matrix: [[3470 0] [ 0 1030]]
Plotting results from models
#Result dataset for dataset 1
TREE = pd.DataFrame(TREE, index=[0])
FOREST =pd.DataFrame(FOREST,index=[0])
NB = pd.DataFrame(NB,index=[0])
KNN = pd.DataFrame(KNN,index=[0])
ADA = pd.DataFrame(ADA,index=[0])
results = pd.concat([TREE, FOREST], axis=0)
results = pd.concat([results, NB], axis=0)
results = pd.concat([results, KNN], axis=0)
results = pd.concat([results, ADA], axis=0)
results['dataset'] = 'dataset 1'
print(results)
model precision recall accuracy f1 ROC AUC dataset 0 TREE 1.000000 1.000000 1.000000 1.00000 1.000000 dataset 1 0 FOREST 1.000000 1.000000 1.000000 1.00000 1.000000 dataset 1 0 NB 0.605927 0.628827 0.692889 0.43221 0.628827 dataset 1 0 KNN 1.000000 1.000000 1.000000 1.00000 1.000000 dataset 1 0 ADA 1.000000 1.000000 1.000000 1.00000 1.000000 dataset 1
#Dataset 2
data = pd.read_csv('C:/Users/santi/OneDrive/Desktop/IBM.csv')
data.columns
#checking values
for x in data.columns:
print(data[x].value_counts())
#checking null values
for x in data.columns:
a = data[x].isna().sum()
if a>0:
print(x + ' ' + a)
data = data.drop(columns=['EmployeeID', 'recorddate_key', 'birthdate_key', 'orighiredate_key',
'terminationdate_key',
'gender_full', 'termreason_desc', 'termtype_desc', 'STATUS_YEAR'])
1318 10
5169 10
5155 10
5157 10
5158 10
..
2568 1
2575 1
2578 1
2579 1
8264 1
Name: EmployeeID, Length: 6284, dtype: int64
12/31/2013 0:00 5215
12/31/2012 0:00 5101
12/31/2011 0:00 4972
12/31/2014 0:00 4962
12/31/2010 0:00 4840
...
09/01/2011 00:00 3
04/01/2015 00:00 3
08/01/2012 00:00 2
06/01/2014 00:00 2
07/01/2014 00:00 2
Name: recorddate_key, Length: 130, dtype: int64
3/23/1973 40
08/04/1954 40
4/27/1956 40
03/06/1956 30
7/13/1972 30
..
9/14/1941 1
09/12/1941 1
09/01/1941 1
8/29/1941 1
6/13/1994 1
Name: birthdate_key, Length: 5342, dtype: int64
9/25/2006 50
08/09/1992 50
2/26/2006 50
10/16/2005 50
12/04/2004 50
..
06/02/1993 1
07/09/1997 1
7/24/1997 1
7/25/1997 1
8/27/2013 1
Name: orighiredate_key, Length: 4415, dtype: int64
01/01/1900 42450
12/30/2014 1079
12/30/2015 674
12/30/2010 25
11/11/2012 21
...
11/13/2006 1
10/31/2006 1
10/30/2006 1
10/03/2006 1
9/14/2013 1
Name: terminationdate_key, Length: 1055, dtype: int64
27 1235
29 1227
28 1225
50 1218
30 1212
26 1210
51 1207
25 1197
49 1196
35 1189
34 1188
53 1188
52 1188
48 1180
36 1176
47 1173
55 1168
33 1164
46 1161
54 1159
44 1157
38 1156
56 1154
32 1153
42 1152
43 1150
37 1149
31 1146
39 1142
45 1141
41 1135
58 1130
57 1130
40 1130
59 1128
24 1111
60 1109
23 960
22 815
61 757
62 712
21 703
63 667
64 646
65 593
20 408
19 158
Name: age, dtype: int64
13 2885
12 2567
8 2559
11 2482
10 2432
9 2381
7 2341
6 2294
3 2270
4 2262
5 2258
2 2257
1 2222
14 2203
15 2192
16 2160
17 2066
0 1962
18 1829
19 1656
20 1322
21 1047
22 830
23 608
24 433
25 121
26 14
Name: length_of_service, dtype: int64
Vancouver 11211
Victoria 4885
Nanaimo 3876
New Westminster 3211
Kelowna 2513
Burnaby 2067
Kamloops 2061
Prince George 2048
Cranbrook 1785
Surrey 1560
Richmond 1401
Terrace 1228
Chilliwack 1167
Trail 925
Langley 901
Vernon 898
Squamish 806
Quesnel 703
Abbotsford 681
North Vancouver 648
Fort St John 621
Williams Lake 617
West Vancouver 613
Port Coquitlam 545
Aldergrove 520
Fort Nelson 322
Nelson 317
New Westminister 254
Grand Forks 236
White Rock 231
Haney 182
Princeton 136
Dawson Creek 129
Bella Bella 126
Ocean Falls 65
Pitt Meadows 57
Cortes Island 43
Valemount 37
Dease Lake 18
Blue River 9
Name: city_name, dtype: int64
Meats 10269
Dairy 8599
Produce 8515
Bakery 8381
Customer Service 7122
Processed Foods 5911
Store Management 271
Executive 100
Recruitment 72
HR Technology 64
Accounting 59
Employee Records 44
Accounts Receiveable 39
Labor Relations 34
Accounts Payable 34
Training 30
Compensation 24
Audit 24
Investment 24
Information Technology 20
Legal 17
Name: department_name, dtype: int64
Meat Cutter 9984
Dairy Person 8590
Produce Clerk 8237
Baker 8096
Cashier 6816
Shelf Stocker 5622
Customer Service Manager 306
Processed Foods Manager 289
Bakery Manager 285
Meats Manager 285
Produce Manager 278
Store Manager 271
Recruiter 62
HRIS Analyst 55
Accounting Clerk 50
Benefits Admin 35
Labor Relations Analyst 30
Accounts Receiveable Clerk 30
Trainer 26
Accounts Payable Clerk 25
Auditor 20
Systems Analyst 20
Investment Analyst 20
Compensation Analyst 20
Corporate Lawyer 17
CEO 10
Exec Assistant, VP Stores 10
Legal Counsel 10
VP Stores 10
VP Human Resources 10
VP Finance 10
Director, Recruitment 10
Exec Assistant, Finance 10
Exec Assistant, Human Resources 10
CHief Information Officer 10
Exec Assistant, Legal Counsel 10
Director, Accounts Payable 9
Director, Accounts Receivable 9
Director, Employee Records 9
Director, HR Technology 9
Dairy Manager 9
Director, Accounting 9
Director, Investments 4
Director, Labor Relations 4
Director, Compensation 4
Director, Audit 4
Director, Training 4
Name: job_title, dtype: int64
46 4422
18 3876
42 3827
21 3211
43 2896
16 2513
5 2067
15 2061
26 2048
8 1785
41 1765
31 1560
44 1520
29 1401
32 1228
6 1167
35 1143
33 925
17 901
36 898
30 806
28 703
1 681
22 648
12 621
40 617
38 613
25 545
2 520
37 463
11 322
19 317
20 254
13 236
39 231
14 182
27 136
9 129
3 126
23 65
45 60
24 57
7 43
34 37
10 18
4 9
Name: store_name, dtype: int64
F 25898
M 23755
Name: gender_short, dtype: int64
Female 25898
Male 23755
Name: gender_full, dtype: int64
Not Applicable 48168
Retirement 885
Resignaton 385
Layoff 215
Name: termreason_desc, dtype: int64
Not Applicable 48168
Voluntary 1270
Involuntary 215
Name: termtype_desc, dtype: int64
2013 5320
2012 5231
2014 5215
2011 5082
2010 4963
2015 4961
2009 4852
2008 4767
2007 4683
2006 4579
Name: STATUS_YEAR, dtype: int64
ACTIVE 48168
TERMINATED 1485
Name: STATUS, dtype: int64
STORES 49068
HEADOFFICE 585
Name: BUSINESS_UNIT, dtype: int64
Plotting and exploratory analysis
#charts and plots
g = sns.countplot(data=data, x="STATUS")
g.tick_params(labelsize=11)
g.set_title('Dataset 2 Attrition count plot')
plt.show()
dd = data
sns.countplot(data=dd, x='gender_short', hue='STATUS')
plt.show()
sns.countplot(data=dd, x='BUSINESS_UNIT', hue='STATUS')
plt.show()
sns.countplot(data=dd, x='department_name', hue='STATUS')
plt.xticks(rotation = 90)
plt.show()
sns.countplot(data=dd, x='age', hue= 'STATUS')
plt.xticks(rotation = 90)
plt.show()
bins = 10
ax1 = sns.histplot(data=dd, x="age", hue='STATUS', multiple="dodge",bins=bins,alpha=.8,kde=True)
sns.move_legend(ax1, "upper right")
ax1.set_title('Age count plot and status')
plt.show()
male = dd[dd['gender_short']=='M']
fem = dd[dd['gender_short']=='F']
ter_M = male[male['STATUS']=='TERMINATED']
ter_F = fem[fem['STATUS']=='TERMINATED']
import matplotlib.ticker as mtick
plot = {'Gender': ('Female','Male') , 'Attrition rate': (len(ter_F) / len(fem) * 100, len(ter_M) / len(male) * 100)}
plot = pd.DataFrame(plot)
fig = sns.barplot(data = plot, x='Gender',y='Attrition rate')
fig.yaxis.set_major_formatter(mtick.PercentFormatter(100))
plt.title("Attrition rate by gender")
plt.show()
#encoding for charts
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for x in dd.columns:
label = le.fit_transform(dd[x])
dd.drop(x, axis=1, inplace=True)
dd[x] = label
sns.pairplot(data=dd, hue='STATUS')
#ditribution plot
df1 = dd.iloc[: , :6]
df2 = dd.iloc[: , 6:12]
sns.color_palette("tab10")
n_rows=2
n_cols=3
palette = ('dodgerblue' ,'red','blue' ,'orange','black' ,'purple')
palette = iter(palette)
# Create the subplots
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,figsize=(11, 8))
for i, column in enumerate(df1.columns):
c = next(palette)
sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c)
n_rows=2
n_cols=3
palette = ('green' ,'black','deeppink' ,'magenta','cyan' ,'deeppink')
palette = iter(palette)
# Create the subplots
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,figsize=(11, 8))
for i, column in enumerate(df2.columns):
c = next(palette)
sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
plt.show()
data['STATUS'] = data['STATUS'].replace('ACTIVE',0)
data['STATUS'] = data['STATUS'].replace('TERMINATED',1)
C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:70: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df1[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:80: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:80: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c) C:\Users\santi\AppData\Local\Temp\ipykernel_17924\1472844368.py:80: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(df2[column],ax=axes[i//n_cols,i%n_cols],color=c)
#X and Y
y = data['STATUS']
x = data.drop(columns=['STATUS'])
#Leave one out encoder
import category_encoders as ce
ce = ce.LeaveOneOutEncoder(cols=['city_name','department_name', 'store_name',
'gender_short','BUSINESS_UNIT','job_title'],return_df=True)
x = ce.fit_transform(x,y)
# decision tree for feature importance on a classification problem
model = RandomForestClassifier()
# fit the model
model.fit(x,y)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()
x = x[['age','gender_short','BUSINESS_UNIT']]
Feature: 0, Score: 0.17533 Feature: 1, Score: 0.00395 Feature: 2, Score: 0.02174 Feature: 3, Score: 0.02162 Feature: 4, Score: 0.01399 Feature: 5, Score: 0.02186 Feature: 6, Score: 0.16474 Feature: 7, Score: 0.57678
Train test split , multicolinearity and resampling
#split
X_train, X_test, y_train, y_test = train_test_split(x, y,test_size=0.30)
#adasyn resampling
print("Before resampling: ", Counter(y_train))
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=42)
X_train, y_train= ada.fit_resample(X_train,y_train)
print("After ADASYN: ", Counter(y_train))
#correlation matrix
cor = X_train.corr()
plt.figure(figsize=(12,10))
sns.heatmap(cor, cmap=plt.cm.CMRmap_r,annot=True)
plt.show()
Before resampling: Counter({0: 33715, 1: 1042})
After ADASYN: Counter({1: 33719, 0: 33715})
Ada Boosting and KNN
#ADA
# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50,
learning_rate=1)
# Train Adaboost Classifer
model = abc.fit(X_train, y_train)
#Predict the response for test dataset
y_pred = model.predict(X_test)
#cv
#create a new KNN model
#train model with cv of 5
cv_scores = cross_val_score(abc, X_train, y_train, cv=5)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))
ADA2 = {'model': 'ADA' ,'precision':precision_score(y_test, y_pred, average='macro'),
'recall':recall_score(y_test, y_pred, average='macro'),
'accuracy': accuracy_score(y_test, y_pred),
'f1': f1_score(y_test, y_pred , average = 'binary'),
'ROC AUC': roc_auc_score(y_test, y_pred) }
print('ADA')
CM_ADA2 = confucio(y_test, y_pred)
evaluate_model(abc)
#knn
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))
#Checking for k value
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
# Loop over K values
for i, k in enumerate(neighbors):
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
# Compute training and test data accuracy
train_accuracy[i] = knn.score(X_train, y_train)
test_accuracy[i] = knn.score(X_test, y_test)
# Generate plot
plt.plot(neighbors, test_accuracy, label = 'Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy')
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()
#cv
#create a new KNN model
knn_cv = KNeighborsClassifier(n_neighbors=5)
knn_cv.fit(X_train, y_train)
#train model with cv of 5
cv_scores = cross_val_score(knn_cv, X_train, y_train, cv=5)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))
print('knn')
evaluate_model(knn_cv)
KNN2 = {'model': 'KNN' ,'precision':precision_score(y_test, y_pred, average='macro'),
'recall':recall_score(y_test, y_pred, average='macro'),
'accuracy': accuracy_score(y_test, y_pred),
'f1': f1_score(y_test, y_pred , average = 'binary'),
'ROC AUC': roc_auc_score(y_test, y_pred) }
CM_KNN2 = confucio(y_test, y_pred)
[0.9967376 1. 1. 1. 1. ] cv_scores mean:0.9993475198339141 ADA Train Accuracy : 1.0 Train Confusion Matrix: [[33715 0] [ 0 33719]] -------------------------------------------------- Test Accuracy : 1.0 Test Confusion Matrix: [[14453 0] [ 0 443]] 0.9990601503759399
[0.923037 0.99822051 0.98635723 0.99733076 0.98413169] cv_scores mean:0.9778154390404408 knn Train Accuracy : 0.9995699498769167 Train Confusion Matrix: [[33714 1] [ 28 33691]] -------------------------------------------------- Test Accuracy : 0.9990601503759399 Test Confusion Matrix: [[14451 2] [ 12 431]]
Naive Bayes and Decision Tree
#NB
nb_model = Pipeline([
('classification', GaussianNB())
])
nb_model.get_params().keys()
nb_clf = GridSearchCV(estimator=nb_model, param_grid={}, scoring='recall', cv=5)
nb_clf.fit(X_train, y_train)
y_pred = nb_clf.predict(X_test)
model_nb_cm = confucio(y_test, y_pred)
NB2 = {'model': 'NB' ,'precision':precision_score(y_test, y_pred, average='macro'),
'recall':recall_score(y_test, y_pred, average='macro'),
'accuracy': accuracy_score(y_test, y_pred),
'f1': f1_score(y_test, y_pred , average = 'binary'),
'ROC AUC': roc_auc_score(y_test, y_pred) }
CM_NB2 = confucio(y_test, y_pred)
print('NB')
evaluate_model(nb_clf)
#Decision tree
clf = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=3)
clf = clf.fit(X_train, y_train)
tree.plot_tree(clf)
y_pred= clf.predict(X_test)
model_tree_cm = confucio(y_test, y_pred)
#cv
#train model with cv of 5
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))
TREE2 = {'model': 'TREE' ,'precision':precision_score(y_test, y_pred, average='macro'),
'recall':recall_score(y_test, y_pred, average='macro'),
'accuracy': accuracy_score(y_test, y_pred),
'f1': f1_score(y_test, y_pred , average = 'binary'),
'ROC AUC': roc_auc_score(y_test, y_pred) }
print('TREE')
CM_TREE2 = confucio(y_test, y_pred)
evaluate_model(clf)
NB Train Accuracy : 0.5047898686122727 Train Confusion Matrix: [[ 365 33350] [ 44 33675]] -------------------------------------------------- Test Accuracy : 0.038198174006444686 Test Confusion Matrix: [[ 151 14302] [ 25 418]] [0.9967376 1. 1. 1. 1. ] cv_scores mean:0.9993475198339141 TREE Train Accuracy : 1.0 Train Confusion Matrix: [[33715 0] [ 0 33719]] -------------------------------------------------- Test Accuracy : 1.0 Test Confusion Matrix: [[14453 0] [ 0 443]]
Random Forest
#forest
dt = RandomForestClassifier(random_state=42)
params = {
'max_depth': [1,2,3,4,5,6,7,8,9,10,15, 20],
'min_samples_leaf': [1,2,3,4,5,6,7,8,9, 10,15, 20, 50, 100],
'criterion': ["gini", "entropy"]
}
grid_search = GridSearchCV(estimator=dt,
param_grid=params,
cv=5, n_jobs=-1, verbose=1, scoring = "accuracy")
grid_search.fit(X_train, y_train)
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()
score_df.nlargest(5,"mean_test_score")
dt_best = grid_search.best_estimator_
FOREST2 = {'model': 'FOREST' ,'precision':precision_score(y_test, y_pred, average='macro'),
'recall':recall_score(y_test, y_pred, average='macro'),
'accuracy': accuracy_score(y_test, y_pred),
'f1': f1_score(y_test, y_pred , average = 'binary'),
'ROC AUC': roc_auc_score(y_test, y_pred) }
CM_FOREST2 = confucio(y_test, y_pred)
print('Forest')
evaluate_model(grid_search)
Fitting 5 folds for each of 336 candidates, totalling 1680 fits Forest Train Accuracy : 0.9993475101580805 Train Confusion Matrix: [[33715 0] [ 44 33675]] -------------------------------------------------- Test Accuracy : 0.9983216970998926 Test Confusion Matrix: [[14453 0] [ 25 418]]
TREE2 = pd.DataFrame(TREE2, index=[0])
FOREST2 =pd.DataFrame(FOREST2,index=[0])
NB2 = pd.DataFrame(NB2,index=[0])
KNN2 = pd.DataFrame(KNN2,index=[0])
ADA2 = pd.DataFrame(ADA2,index=[0])
results2 = pd.concat([TREE2, FOREST2], axis=0)
results2 = pd.concat([results2, NB2], axis=0)
results2 = pd.concat([results2, KNN2], axis=0)
results2 = pd.concat([results2, ADA2], axis=0)
results2['dataset'] = 'dataset 2'
results_final = pd.concat([results2, results], axis=0)
results_final = results_final.melt(id_vars =['model','dataset'],
value_vars =['precision','recall','f1','ROC AUC'], var_name = 'metric')
results_final.rename(columns = {'value':'score'}, inplace = True)
print(results_final)
fig,ax = plt.subplots(1,2,figsize=(20, 10))
plt.suptitle('FInal scores', fontsize=25)
a = sns.barplot(data=results_final[results_final['dataset']=='dataset 1'], x='model',y='score',hue= 'metric',ax=ax[0])
a = sns.barplot(data=results_final[results_final['dataset']=='dataset 2'], x='model',y='score',hue= 'metric',ax=ax[1])
model dataset metric score 0 TREE dataset 2 precision 1.000000 1 FOREST dataset 2 precision 1.000000 2 NB dataset 2 precision 0.443176 3 KNN dataset 2 precision 1.000000 4 ADA dataset 2 precision 1.000000 5 TREE dataset 1 precision 1.000000 6 FOREST dataset 1 precision 1.000000 7 NB dataset 1 precision 0.605927 8 KNN dataset 1 precision 1.000000 9 ADA dataset 1 precision 1.000000 10 TREE dataset 2 recall 1.000000 11 FOREST dataset 2 recall 1.000000 12 NB dataset 2 recall 0.477007 13 KNN dataset 2 recall 1.000000 14 ADA dataset 2 recall 1.000000 15 TREE dataset 1 recall 1.000000 16 FOREST dataset 1 recall 1.000000 17 NB dataset 1 recall 0.628827 18 KNN dataset 1 recall 1.000000 19 ADA dataset 1 recall 1.000000 20 TREE dataset 2 f1 1.000000 21 FOREST dataset 2 f1 1.000000 22 NB dataset 2 f1 0.055134 23 KNN dataset 2 f1 1.000000 24 ADA dataset 2 f1 1.000000 25 TREE dataset 1 f1 1.000000 26 FOREST dataset 1 f1 1.000000 27 NB dataset 1 f1 0.432210 28 KNN dataset 1 f1 1.000000 29 ADA dataset 1 f1 1.000000 30 TREE dataset 2 ROC AUC 1.000000 31 FOREST dataset 2 ROC AUC 1.000000 32 NB dataset 2 ROC AUC 0.477007 33 KNN dataset 2 ROC AUC 1.000000 34 ADA dataset 2 ROC AUC 1.000000 35 TREE dataset 1 ROC AUC 1.000000 36 FOREST dataset 1 ROC AUC 1.000000 37 NB dataset 1 ROC AUC 0.628827 38 KNN dataset 1 ROC AUC 1.000000 39 ADA dataset 1 ROC AUC 1.000000
AS shown in plot my models for both datasets were able to predict with hight accuracy